Импорт данных и библиотек¶

In [1]:
# !pip install geopandas
# !pip install h3
# !pip install folium
# !pip install osmnx
# !pip install geojson
In [2]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import json
import h3
import folium
import osmnx as ox
from shapely import wkt
from folium.plugins import HeatMap
from shapely.geometry import Polygon
from folium.plugins import MarkerCluster, HeatMap
import pandas as pd
from shapely.geometry import Polygon
from geojson import Feature, Point, FeatureCollection, Polygon
import plotly.express as px
from tqdm import tqdm

tqdm.pandas()

interests_df = pd.read_csv("stupino_interests.csv")
locs_df = pd.read_csv("stupino_locs.csv")

Распределение данных по карте¶

Построение диаграммы, сколько в каждом гексагоне карты отметок пользователя

In [3]:
H3_res = 9  # размер гексагона [1 .. 15] чем больше, тем меньше площадь


def geo_to_h3(row):
    return h3.geo_to_h3(lat=row.lat, lng=row.lon, resolution=H3_res)


locs_df['h3_cell'] = locs_df.progress_apply(geo_to_h3, axis=1)
100%|██████████| 10880142/10880142 [03:12<00:00, 56543.06it/s]
In [4]:
locs_df_g = (locs_df
             .groupby('h3_cell')
             .id
             .agg(list)
             .to_frame("ids")
             .reset_index())
# Let's count each points inside the hexagon
locs_df_g['count'] = (locs_df_g['ids']
                      .progress_apply(lambda ignition_ids: len(ignition_ids)))
100%|██████████| 1573/1573 [00:00<00:00, 696467.88it/s]
In [5]:
from shapely.geometry import Polygon


def add_geometry(row):
    points = h3.h3_to_geo_boundary(row['h3_cell'], True)
    return Polygon(points)


#Apply function into our dataframe
locs_df_g['geometry'] = (locs_df_g
                         .progress_apply(add_geometry, axis=1))
100%|██████████| 1573/1573 [00:00<00:00, 9454.89it/s]
In [6]:
def hexagons_dataframe_to_geojson(df_hex, hex_id_field, geometry_field, value_field, file_output=None):
    list_features = []

    for i, row in df_hex.iterrows():
        feature = Feature(geometry=row[geometry_field],
                          id=row[hex_id_field],
                          properties={"value": row[value_field]})
        list_features.append(feature)

    feat_collection = FeatureCollection(list_features)

    if file_output is not None:
        with open(file_output, "w") as f:
            json.dump(feat_collection, f)

    else:
        return feat_collection


geojson_obj = (hexagons_dataframe_to_geojson
               (locs_df_g,
                hex_id_field='h3_cell',
                value_field='count',
                geometry_field='geometry'))
In [172]:
import plotly.express as px

fig = px.choropleth_mapbox(
    locs_df_g,
    geojson=geojson_obj,
    locations='h3_cell',
    color='count',
    color_continuous_scale="Viridis",
    range_color=(0, locs_df_g['count'].mean()),
    mapbox_style='carto-positron',
    zoom=12,
    center={"lat": locs_df.lat.mean(), "lon": locs_df.lon.mean()},
    opacity=0.1,
    labels={'count': 'count of data'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show("notebook")
In [ ]:
 

Возраст¶

In [173]:
ages_df = interests_df[[age for age in interests_df.columns if age.startswith("age_")]]
ages_df.apply(sum).to_frame().style.bar()
Out[173]:
  0
age_18_24 50297
age_25_34 53941
age_35_44 41632
age_45_54 22903
age_17 15314
age_55 27033

Пол¶

In [174]:
gender_df = interests_df[[gender for gender in interests_df.columns if gender.startswith("gender_")]]
gender_df.apply(sum).to_frame().style.bar()
Out[174]:
  0
gender_female 73533
gender_male 73437

Трудоустройство¶

In [175]:
employment_df = interests_df[[user for user in interests_df.columns if user.startswith("employment_")]]
employment_df.apply(sum).to_frame().style.bar()
Out[175]:
  0
employment_working 78881
employment_not_working 26591

Образование¶

In [176]:
interests_df[[user for user in interests_df.columns if user.startswith("availability_of_education_")]].apply(
    sum).to_frame().style.bar()
Out[176]:
  0
availability_of_education_has_a_higher_education 55189
availability_of_education_no_higher_education 64654

Дети¶

In [177]:
interests_df[[user for user in interests_df.columns if user.startswith("children_")]].apply(sum).to_frame().style.bar()
Out[177]:
  0
children_under_16_there_are_children_in_the_family 71838
children_under_16_no_children_in_the_family 78563

Семейное положение¶

In [178]:
interests_df[[user for user in interests_df.columns if user.startswith("marital_")]].apply(sum).to_frame().style.bar()
Out[178]:
  0
marital_status_married 22707
marital_status_not_married 52332

Доход¶

In [179]:
interests_df[[user for user in interests_df.columns if "individual_income_" in user]].apply(sum).to_frame().style.bar()
Out[179]:
  0
individual_income_a_below_average_income 5453
individual_income_b_average_income 11620
individual_income_c_above_average_income 5440
individual_income_d_high_income 2085
individual_income_e_premium 1262
In [180]:
interests_df[[user for user in interests_df.columns if "household_income_" in user]].apply(sum).to_frame().style.bar()
Out[180]:
  0
household_income_a_below_average 3917
household_income_b_average 50697
household_income_c_above_average 86867

Интересы¶

In [181]:
interests_df[[user for user in interests_df.columns if user.startswith("interests_")]].apply(sum).sort_values(
    ascending=False).to_frame().style.bar()
Out[181]:
  0
interests_banks_banking_services 61024
interests_new_buildings 58974
interests_car_owners 55978
interests_b2b_documentary_and_financial_and_legal_support 52628
interests_auto_premium_class 51677
interests_medium_and_large_business 51580
interests_renting_residential_property 50192
interests_b2b_it_for_business 50041
interests_auto_middle_class 50041
interests_auto_economy_class 49551
interests_b2b_raw_materials 48303
interests_resale_property 48132
interests_baby_products 47634
interests_interest_in_buying_a_new_car 47208
interests_contributions_and_deposits 46209
interests_mortgage 45086
interests_loans_for_business 44978
interests_credit_cards 44713
interests_interest_in_insurance 44569
interests_consumer_loans 42902
interests_interest_in_buying_a_new_premium_car 42245
interests_auto_insurance 41112
interests_moto 40302
interests_interest_in_buying_a_new_economy_class_car 39955
interests_interest_in_buying_a_mobile_phone 39740
interests_freight_and_commercial_vehicles 39159
interests_auto_parts_and_service 38710
interests_interest_in_buying_a_new_middle_class_car 38656
interests_all_about_children 37114
interests_overseas_real_estate 36566
interests_wedding 35279
interests_b2b_trade_equipment_and_goods_wholesale 34546
interests_used_cars 34170
interests_b2b_medical_equipment 34113
interests_baby_food 32770
interests_b2b_agriculture 32546
interests_mobile_devices 30739
interests_television_and_video_equipment 29597
interests_special_equipment 29147
interests_car_loans 26554
interests_cell_phones_and_headset 25890
interests_auto_suvs 25263
interests_tires_and_wheels 24931
interests_parents_of_toddlers 23065
interests_b2b_office 20957
interests_telecom_operators 20718
interests_quotes_stock_markets 20256
interests_laptops_and_netbooks 19950
interests_microloans 19630
interests_using_online_banking 19531
interests_pregnancy_and_childbirth 19376
interests_learning_languages 18118
interests_parents_of_middle_and_high_school_students 17960
interests_use_of_electronic_money 17907
interests_of_parents_of_primary_school_students 16924
interests_parents_of_newborns 16849
interests_b2b_equipment_machines_energy_supply 16636
interests_mobile_communications_and_internet_access 16358
interests_photo_and_video_cameras 16136
interests_houses_cottages_and_land_plots 15382
interests_education 14821
interests_small_business 14804
interests_audio_engineering 14345
interests_business_education 13252
interests_parents_of_preschoolers 11619
interests_internet_access 11511
interests_childrens_health 11186
interests_finance_and_accounting 11114
interests_commercial_real_estate 9786
interests_basic 8983
interests_legal_support 8964
interests_preschool 8252
interests_b2b_advertising_and_marketing 6442
interests_higher 5035
interests_auto_electronics_and_gps 5010
interests_tablets_and_ereaders 3532
interests_average 2693
interests_tvs 2620
interests_human_resources 877
interests_specialized_secondary 875
interests_active_mobile_internet_users 859

Построение гистограммы интересов внутри гексагона¶

In [171]:
h3_cell = "89118172457ffff"
h3_cell_from_center = "8911817240fffff"
ids = locs_df_g[locs_df_g["h3_cell"] == h3_cell_from_center]["ids"].to_list()[0]
df = interests_df[[user for user in interests_df.columns if user.startswith("interests_") or "id" in user]]
df = df[df.id.isin(ids)]
df.drop("id", axis=1).apply(sum).sort_values(ascending=False).to_frame().style.bar()
Out[171]:
  0
interests_banks_banking_services 998
interests_new_buildings 975
interests_car_owners 911
interests_baby_products 839
interests_resale_property 812
interests_all_about_children 795
interests_parents_of_toddlers 721
interests_cell_phones_and_headset 719
interests_parents_of_middle_and_high_school_students 688
interests_b2b_documentary_and_financial_and_legal_support 659
interests_of_parents_of_primary_school_students 655
interests_mortgage 649
interests_b2b_office 612
interests_parents_of_newborns 594
interests_medium_and_large_business 594
interests_pregnancy_and_childbirth 572
interests_learning_languages 555
interests_renting_residential_property 530
interests_special_equipment 519
interests_auto_premium_class 510
interests_b2b_it_for_business 491
interests_mobile_communications_and_internet_access 491
interests_auto_parts_and_service 480
interests_baby_food 476
interests_education 468
interests_microloans 458
interests_auto_middle_class 454
interests_moto 432
interests_auto_economy_class 423
interests_television_and_video_equipment 422
interests_interest_in_insurance 419
individual_income_b_average_income 417
interests_houses_cottages_and_land_plots 397
interests_small_business 385
interests_interest_in_buying_a_mobile_phone 370
interests_mobile_devices 369
interests_b2b_raw_materials 367
interests_telecom_operators 322
interests_internet_access 322
interests_interest_in_buying_a_new_car 318
interests_finance_and_accounting 314
interests_basic 309
interests_parents_of_preschoolers 309
interests_auto_suvs 308
interests_freight_and_commercial_vehicles 291
interests_loans_for_business 285
interests_used_cars 280
interests_contributions_and_deposits 277
individual_income_c_above_average_income 260
interests_childrens_health 253
interests_credit_cards 244
interests_commercial_real_estate 241
interests_photo_and_video_cameras 234
interests_tires_and_wheels 231
individual_income_a_below_average_income 221
interests_laptops_and_netbooks 210
interests_audio_engineering 193
interests_b2b_trade_equipment_and_goods_wholesale 186
interests_legal_support 184
interests_use_of_electronic_money 179
interests_consumer_loans 166
interests_higher 166
interests_using_online_banking 156
interests_wedding 151
interests_preschool 143
interests_auto_insurance 143
interests_car_loans 136
interests_auto_electronics_and_gps 132
interests_quotes_stock_markets 126
interests_interest_in_buying_a_new_middle_class_car 125
interests_b2b_equipment_machines_energy_supply 120
interests_overseas_real_estate 119
interests_average 117
interests_interest_in_buying_a_new_premium_car 110
interests_interest_in_buying_a_new_economy_class_car 108
interests_b2b_advertising_and_marketing 106
individual_income_d_high_income 105
interests_business_education 100
interests_tablets_and_ereaders 98
interests_tvs 78
interests_b2b_medical_equipment 70
individual_income_e_premium 54
interests_specialized_secondary 41
interests_human_resources 38
interests_b2b_agriculture 17
interests_active_mobile_internet_users 13

Эмбединги гексагонов¶

  • по геохэшу гексагона получим распределение интересов в отдельном гексагоне
  • возьмём общее распределение по интересам
  • отнормируем распределения
  • вычтем из общего распределения, распределение в данном гексагоне в надежде что таким образом получится убрать общую тенденцию и подчеркнуть локальные интересы
  • минимальный интерес разницы можно считать более интересным людям в данном гексагоне чем в общем случае и его можно рекомендовать
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")
all_int_df = interests_df[[user for user in interests_df.columns if user.startswith("interests_")]].apply(sum)

# plt.xticks(range(len(new_df)), keys,rotation=90)
In [193]:
ids = locs_df_g[locs_df_g["h3_cell"] == h3_cell_from_center]["ids"].to_list()[0]
df = interests_df[[user for user in interests_df.columns if user.startswith("interests_") or "id" in user]]
df = df[df.id.isin(ids)]
df = df[[user for user in df.columns if user.startswith("interests_")]].apply(sum)
# df = df.drop("id", axis=1).apply(sum)
df
# all_int_df.plot(kind="bar", position=0, width=1.0, rot=90, figsize=(18,5))
# df.plot(kind="bar", position=0, width=1.0, rot=90, figsize=(18,5))
Out[193]:
interests_b2b_advertising_and_marketing                      106
interests_b2b_raw_materials                                  367
interests_b2b_equipment_machines_energy_supply               120
interests_b2b_office                                         612
interests_b2b_documentary_and_financial_and_legal_support    659
                                                            ... 
interests_parents_of_toddlers                                721
interests_parents_of_preschoolers                            309
interests_of_parents_of_primary_school_students              655
interests_parents_of_middle_and_high_school_students         688
interests_business_education                                 100
Length: 81, dtype: int64
In [194]:
ddf = pd.DataFrame({"all": all_int_df, "hex": df})
normalized_df = (ddf - ddf.min()) / (ddf.max() - ddf.min())
normalized_df.plot(kind='bar', alpha=0.7, position=0, width=1.0, rot=90, figsize=(18, 5))
Out[194]:
<AxesSubplot: >
In [195]:
diff = normalized_df["all"] - normalized_df["hex"]
diff.plot(kind='bar', alpha=0.7, position=0, width=1.0, rot=90, figsize=(18, 5))
Out[195]:
<AxesSubplot: >
In [197]:
diff.sort_values(ascending=True).to_frame().style.bar()
Out[197]:
  0
interests_parents_of_middle_and_high_school_students -0.401044
interests_of_parents_of_primary_school_students -0.384761
interests_parents_of_toddlers -0.349697
interests_parents_of_newborns -0.324079
interests_cell_phones_and_headset -0.300712
interests_b2b_office -0.274074
interests_learning_languages -0.263393
interests_pregnancy_and_childbirth -0.259742
interests_education -0.229867
interests_mobile_communications_and_internet_access -0.227671
interests_all_about_children -0.191316
interests_basic -0.165479
interests_houses_cottages_and_land_plots -0.148462
interests_small_business -0.145886
interests_microloans -0.139785
interests_internet_access -0.136659
interests_finance_and_accounting -0.135136
interests_parents_of_preschoolers -0.121666
interests_higher -0.085921
interests_commercial_real_estate -0.083097
interests_average -0.075101
interests_childrens_health -0.072010
interests_baby_products -0.061133
interests_auto_electronics_and_gps -0.051819
interests_special_equipment -0.043532
interests_tablets_and_ereaders -0.041867
interests_legal_support -0.038891
interests_tvs -0.036720
interests_specialized_secondary -0.028160
interests_resale_property -0.025445
interests_human_resources -0.025082
interests_new_buildings -0.010723
interests_preschool -0.009101
interests_b2b_advertising_and_marketing -0.001621
interests_active_mobile_internet_users 0.000000
interests_banks_banking_services 0.000000
interests_car_owners 0.004456
interests_telecom_operators 0.016370
interests_photo_and_video_cameras 0.029553
interests_audio_engineering 0.041409
interests_baby_food 0.060341
interests_television_and_video_equipment 0.062425
interests_mortgage 0.089410
interests_auto_suvs 0.106125
interests_use_of_electronic_money 0.114826
interests_laptops_and_netbooks 0.117311
interests_business_education 0.117659
interests_mobile_devices 0.135213
interests_b2b_equipment_machines_energy_supply 0.153599
interests_auto_parts_and_service 0.155008
interests_using_online_banking 0.165169
interests_tires_and_wheels 0.178780
interests_b2b_documentary_and_financial_and_legal_support 0.204613
interests_quotes_stock_markets 0.207676
interests_moto 0.230200
interests_medium_and_large_business 0.253184
interests_used_cars 0.282595
interests_interest_in_buying_a_mobile_phone 0.283803
interests_renting_residential_property 0.295089
interests_car_loans 0.302202
interests_interest_in_insurance 0.314319
interests_b2b_it_for_business 0.332173
interests_auto_premium_class 0.340075
interests_freight_and_commercial_vehicles 0.354349
interests_auto_middle_class 0.369736
interests_b2b_trade_equipment_and_goods_wholesale 0.384276
interests_auto_economy_class 0.393064
interests_b2b_raw_materials 0.429174
interests_wedding 0.431992
interests_loans_for_business 0.457158
interests_interest_in_buying_a_new_car 0.460720
interests_contributions_and_deposits 0.485740
interests_overseas_real_estate 0.485870
interests_credit_cards 0.494378
interests_b2b_medical_equipment 0.494845
interests_interest_in_buying_a_new_middle_class_car 0.514517
interests_b2b_agriculture 0.522607
interests_auto_insurance 0.537064
interests_consumer_loans 0.543465
interests_interest_in_buying_a_new_economy_class_car 0.553366
interests_interest_in_buying_a_new_premium_car 0.589398
In [ ]:
 
In [86]:
def get_embedding(geohash_h3):
    ids = locs_df_g[locs_df_g["h3_cell"] == geohash_h3]["ids"].to_list()[0]
    df = interests_df[[user for user in interests_df.columns if user.startswith("interests_") or "id" in user]]
    df = df[df.id.isin(ids)]
    df = df[[user for user in df.columns if user.startswith("interests_")]].apply(sum)
    ddf = pd.DataFrame({"all": all_int_df, "hex": df})
    normalized_df = (ddf - ddf.min()) / (ddf.max() - ddf.min())
    diff = normalized_df["all"] - normalized_df["hex"]
    return diff.to_numpy()
hex_df = pd.DataFrame({"h3_geohash":locs_df_g["h3_cell"]})# get_embedding("89118172457ffff")
hex_df["hex_emb"] = hex_df.h3_geohash.progress_apply(lambda geohash: get_embedding(geohash))
100%|██████████| 1573/1573 [00:36<00:00, 42.76it/s]
In [87]:
hex_df
Out[87]:
h3_geohash hex_emb
0 8911810832fffff [0.09279481426078284, 0.12189811352115021, 0.2...
1 8911810836fffff [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
2 89118108803ffff [0.09279481426078284, -0.21143521981218316, 0....
3 8911810880bffff [0.09279481426078284, 0.7885647801878168, 0.26...
4 8911810880fffff [-0.24053851907255047, 0.4552314468544835, -0....
... ... ...
1568 891181735b7ffff [0.09279481426078284, 0.7885647801878168, 0.26...
1569 891181735bbffff [0.007428960602246251, 0.45319892652928023, 0....
1570 891181735c7ffff [-0.024852244562746567, 0.4356236037172286, 0....
1571 891181735d3ffff [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
1572 891181735d7ffff [0.05575777722374581, 0.3441203357433724, 0.11...

1573 rows × 2 columns

Кластеризация гексагонов¶

In [199]:
from sklearn.manifold import TSNE
import plotly.express as px
from sklearn.impute import SimpleImputer

features = np.stack(hex_df.hex_emb.values)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
features = imp.fit_transform(features)

tsne = TSNE(n_components=2, random_state=42)
projections = tsne.fit_transform(features)

fig = px.scatter(
    projections, x=0, y=1,
)
fig.show("notebook")
In [200]:
import matplotlib.pyplot as plt
# from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
In [201]:
kmeans = KMeans(
    init="random",
    n_clusters=3,
    n_init=10,
    max_iter=300,
    random_state=42
)

kmeans.fit(features)
Out[201]:
KMeans(init='random', n_clusters=3, n_init=10, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(init='random', n_clusters=3, n_init=10, random_state=42)
In [202]:
from sklearn.manifold import TSNE
import seaborn as sns
sns.color_palette("rocket")
from sklearn.cluster import KMeans
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters)
skillsPCA_labels = kmeans.fit_predict(features)

Xtsne = TSNE(n_components=2,random_state=42).fit_transform(features)
dftsne = pd.DataFrame(Xtsne)
dftsne['cluster'] = skillsPCA_labels
dftsne.columns = ['x1','x2','cluster']

sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.scatterplot(data=dftsne,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5)
/home/evg/.config/JetBrains/DataSpell2022.2/projects/workspace/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

Out[202]:
<AxesSubplot: xlabel='x1', ylabel='x2'>
In [ ]:
 
In [203]:
hex_df["cluster"] = dftsne["cluster"]
In [204]:
hex_df
Out[204]:
h3_geohash hex_emb cluster
0 8911810832fffff [0.09279481426078284, 0.12189811352115021, 0.2... 1
1 8911810836fffff [nan, nan, nan, nan, nan, nan, nan, nan, nan, ... 1
2 89118108803ffff [0.09279481426078284, -0.21143521981218316, 0.... 1
3 8911810880bffff [0.09279481426078284, 0.7885647801878168, 0.26... 3
4 8911810880fffff [-0.24053851907255047, 0.4552314468544835, -0.... 1
... ... ... ...
1568 891181735b7ffff [0.09279481426078284, 0.7885647801878168, 0.26... 1
1569 891181735bbffff [0.007428960602246251, 0.45319892652928023, 0.... 0
1570 891181735c7ffff [-0.024852244562746567, 0.4356236037172286, 0.... 1
1571 891181735d3ffff [nan, nan, nan, nan, nan, nan, nan, nan, nan, ... 1
1572 891181735d7ffff [0.05575777722374581, 0.3441203357433724, 0.11... 0

1573 rows × 3 columns

Отрисовка кластеров на карте¶

In [205]:
fig = px.choropleth_mapbox(
    hex_df,
    geojson=geojson_obj,
    locations='h3_geohash',
    color='cluster',
    color_continuous_scale="Viridis",
    range_color=(0,n_clusters-1),
    mapbox_style='carto-positron',
    zoom=12,
    center={"lat": locs_df.lat.mean(), "lon": locs_df.lon.mean()},
    opacity=0.1,
    labels={'count': 'count of data'})
fig.update_layout(margin={"r": 0, "t": 1, "l": 2, "b": 3})
fig.show("notebook")
In [ ]:
 
In [ ]:
 
In [ ]: